Set-up

library(here) # for reproducible paths
library(SingleCellExperiment)
library(scater) # For qc
library(org.Mm.eg.db) # To annotate the genenames
sce <- readRDS(here("processed", "sce.RDS"))

Gene QC

We start by removing genes not expressed in any cell

genes_beforeqc <- dim(sce)[1]
keep_feature <- rowSums(counts(sce) > 0) > 0
sce <- sce[keep_feature, ]
genes_beforeqc -dim(sce)[1] 
## [1] 6511

6511 cells are not expressed in any cell.

Add cell QC metrics to the sce

First are sorted the genenames and gene symbols, because the default ensembl notation is not very handy. And then save the mitochondrial genes as such.

if(!file.exists(here("processed", "sce_preliminary.RDS"))){
# obtain full genenames
genename <- mapIds(org.Mm.eg.db, keys = rownames(sce), keytype = "ENSEMBL", column = c("GENENAME"))
# Use the symbols as rownames
# first make gene names unique 
# TODO: save duplicate gene name list
symb_unique <- uniquifyFeatureNames(rownames(sce), rowData(sce)[, "Symbol"])
# Now they can be used as rownames
rownames(sce) <- symb_unique
# Add full gene names and the uniuqe symbols to the rowdata
rowData(sce)$symb_uniq <- symb_unique
rowData(sce)$gene_name <- genename
# Subset the mitochondrial genes
is_mito <- grepl("^mt-", rownames(sce))
}
<<<<<<< Updated upstream
## 'select()' returned 1:many mapping between keys and columns

Then we can use the scater package to add the quality per cell ## First try log for low thresholds but not for high. In low this prevents negative thresholds

=======

Then we can use the scater package to add the quality per cell

>>>>>>> Stashed changes
if(!file.exists(here("processed", "sce_preliminary.RDS"))){
sce <- addPerCellQC(sce, subsets = list(mt=is_mito))
# Automated outlier detection
outlier_lib_low <- isOutlier(sce$total, log = TRUE, type = "lower")
outlier_expr_low <-
  isOutlier(sce$detected, log = TRUE, type = "lower")
outlier_lib_high <- isOutlier(sce$total, type = "higher")
outlier_expr_high <-
  isOutlier(sce$detected, type = "higher")
outlier_mt <- isOutlier(sce$subsets_mt_percent, type = "higher")
# total
outlier <-
  outlier_lib_low |
  outlier_expr_low |
  outlier_lib_high | outlier_expr_high | outlier_mt
# See how many cells are detected as outliers
DataFrame(
  lib_size_high = sum(outlier_lib_high),
  expression_high = sum(outlier_expr_high),
  lib_size_low = sum(outlier_lib_low),
  expression_low = sum(outlier_expr_low),
  mt_pct = sum(outlier_mt),
  total = sum(outlier)
)
# And the thresholds
attr(outlier_lib_high, "thresholds")
attr(outlier_expr_high, "thresholds")
attr(outlier_lib_low, "thresholds")
attr(outlier_expr_low, "thresholds")
attr(outlier_mt, "thresholds")
# TODO do the analysis separating by batch?
# Add if it is an outlier to the metadata
sce$outlier <- outlier

# Save the object at this stage with the label "preliminary analysis" 
# As only annotation addition has been done, without deleting anything yet.
saveRDS(sce, here("processed", "sce_preliminary.RDS"))
}else{
  sce <- readRDS(here("processed", "sce_preliminary.RDS"))
}

Plots before QC

Diagnostic plots to visualize the data distribution.

Violin plots

plotColData(sce, x="Sample", y="sum", colour_by="outlier") + 
  ggtitle("Total count") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

plotColData(sce, x="Sample", y="sum", colour_by="outlier") + 
  scale_y_log10() + ggtitle("Total count log scale") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

plotColData(sce, x="Sample", y="detected", colour_by="outlier") + 
  scale_y_log10() + ggtitle("Detected Genes") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

plotColData(sce, x="Sample", y="subsets_mt_percent", colour_by="outlier") + ggtitle("Mitocchondrial percentatge") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Histograms

hist(
    sce$total,
    breaks = 100
)

hist(
    sce$detected,
    breaks = 100
)

plotColData(sce, x="sum", y="subsets_mt_percent", colour_by="outlier")

plotColData(sce, x="sum", y="detected", colour_by="outlier")

plotColData(sce, x="sum", y="detected", colour_by="Sample")

PCA

Here we run a PCA using the information in the metadata instead of the gene expression. It is usefull to visualize the QC parametres.

sce <- runColDataPCA(sce,  variables = c("sum","detected", "subsets_mt_percent"))
plotReducedDim(sce, dimred="PCA_coldata", colour_by = "outlier")

plotReducedDim(sce, dimred="PCA_coldata", colour_by = "Sample")

sce$chip <- as.character(sce$chip)
plotReducedDim(sce, dimred="PCA_coldata", colour_by = "chip")